In [5]:
    
import os
from pathlib import Path

import numpy as np
import pandas as pd
import pylab
import seaborn as sns

import airbnb_pipeline
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
    
In [6]:
    
# Display the kernel's current working directory (bare `pwd` relies on IPython automagic).
# NOTE(review): the rendered output exposes a local filesystem path - consider clearing it before sharing.
pwd
    
    Out[6]:
In [7]:
    
# Directory holding the listings CSV. It can be redirected with the AIRBNB_DATA_DIR
# environment variable; when unset, the original hard-coded location is used so the
# notebook behaves exactly as before on the author's machine.
DATA_DIR = Path(os.environ.get("AIRBNB_DATA_DIR", "/Users/alexpapiu/Documents/Insight/Project/Data"))
LISTINGS_CSV = DATA_DIR / "new-york-city_2016-12-03_data_listings.csv"

# Load the listings file into the `train` frame used by every cell below.
train = pd.read_csv(LISTINGS_CSV)
    
In [4]:
    
# Preview the first five rows of `train`.
train.head(n=5)
    
    Out[4]:
In [5]:
    
# List the column labels of `train`.
train.columns
    
    Out[5]:
In [9]:
    
# Run the project's cleaning step and rebind `train` to its result.
# NOTE(review): because `train` is overwritten, re-running this cell feeds already-cleaned
# data back into `clean` - confirm that `airbnb_pipeline.clean` is safe to apply twice.
train = train.pipe(airbnb_pipeline.clean)
    
    
In [10]:
    
# (rows, columns) of `train` at this point in the notebook.
train.shape
    
    Out[10]:
In [11]:
    
# Histogram of the `price` column using 30 bins.
train.price.hist(bins=30)
    
    Out[11]:
    
In [12]:
    
# Spread `price` into one column per `room_type` value, then draw the columns as
# overlaid (non-stacked), semi-transparent histograms with 25 bins.
price_by_room_type = train.pivot(columns="room_type", values="price")
plot = price_by_room_type.plot.hist(bins=25, stacked=False, alpha=0.7)
    
    
In [13]:
    
# Spread `price` into one column per `bedrooms` value, then draw a stacked
# histogram with 30 bins.
price_by_bedrooms = train.pivot(columns="bedrooms", values="price")
price_by_bedrooms.plot.hist(bins=30, stacked=True)
    
    Out[13]:
    
In [9]:
    
# Peek at the first five values of `host_verifications`.
train["host_verifications"].head(n=5)
    
    Out[9]:
In [10]:
    
# Frequency of each distinct `host_identity_verified` value.
train["host_identity_verified"].value_counts()
    
    Out[10]:
In [25]:
    
# Box plot of `price` grouped by `host_identity_verified`.
sns.boxplot(data=train, x="host_identity_verified", y="price")
    
    Out[25]:
    
In [26]:
    
# Bar plot of `price` grouped by `host_has_profile_pic`.
sns.barplot(data=train, x="host_has_profile_pic", y="price")
    
    Out[26]:
    
In [27]:
    
# Frequency of each distinct `host_has_profile_pic` value.
train.host_has_profile_pic.value_counts()
    
    Out[27]:
In [28]:
    
# Bar plot of `price` grouped by `is_location_exact`.
sns.barplot(data=train, x="is_location_exact", y="price")
    
    Out[28]:
    
In [29]:
    
# Listing-attribute columns to examine against price. The list is bound to a name
# (instead of being a throwaway expression) so it can be reused, and is still
# displayed as the cell's output.
listing_attribute_cols = [
    "property_type",
    "room_type",
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "bed_type",
    "amenities",
    "square_feet",
]
listing_attribute_cols
    
    Out[29]:
In [30]:
    
# Bar plot of `price` grouped by `property_type`.
sns.barplot(data=train, x="property_type", y="price")
    
    Out[30]:
    
In [31]:
    
# Bar plot of `price` grouped by `room_type`.
sns.barplot(data=train, x="room_type", y="price")
    
    Out[31]:
    
In [32]:
    
# Bar plot of `price` grouped by `accommodates`.
sns.barplot(data=train, x="accommodates", y="price")
    
    Out[32]:
    
In [33]:
    
# Bar plot of `price` grouped by `bathrooms`.
sns.barplot(data=train, x="bathrooms", y="price")
    
    Out[33]:
    
In [34]:
    
# Frequency of each distinct `bathrooms` value.
train.bathrooms.value_counts()
    
    Out[34]:
In [35]:
    
# Bar plot of `price` grouped by `bedrooms`.
sns.barplot(data=train, x="bedrooms", y="price")
    
    Out[35]:
    
In [23]:
    
# Frequency of each distinct `bedrooms` value.
train.bedrooms.value_counts()
    
    Out[23]:
In [40]:
    
# square_feet has too many NaNs; count how many entries are missing.
train["square_feet"].isnull().sum()
    
    Out[40]:
In [ ]:
    
# Review-score and booking-policy columns to examine against price. The list is
# bound to a name (instead of being a throwaway expression) so it can be reused,
# and is still displayed as the cell's output.
review_and_policy_cols = [
    "review_scores_location",
    "review_scores_value",
    "requires_license",
    "license",
    "jurisdiction_names",
    "instant_bookable",
    "cancellation_policy",
    "require_guest_profile_picture",
    "require_guest_phone_verification",
    "calculated_host_listings_count",
    "reviews_per_month",
]
review_and_policy_cols
    
In [42]:
    
# Scatter plot with a fitted regression line: price vs. review_scores_location.
# x/y are passed by keyword: seaborn 0.12+ rejects positional x/y, and the
# keyword form matches the other plotting cells in this notebook.
sns.lmplot(x="review_scores_location", y="price", data=train)
    
    Out[42]:
    
In [43]:
    
# Bar plot of `price` grouped by `review_scores_location`.
# x/y are passed by keyword: seaborn 0.12+ rejects positional x/y, and the
# keyword form matches the other plotting cells in this notebook.
sns.barplot(x="review_scores_location", y="price", data=train)
    
    Out[43]:
    
In [44]:
    
# Bar plot of `price` grouped by `review_scores_value`.
# x/y are passed by keyword: seaborn 0.12+ rejects positional x/y, and the
# keyword form matches the other plotting cells in this notebook.
sns.barplot(x="review_scores_value", y="price", data=train)
    
    Out[44]:
    
In [51]:
    
# Bar plot of `price` grouped by `review_scores_rating`, restricted to rows whose
# rating is above 70.
# x/y are passed by keyword: seaborn 0.12+ rejects positional x/y, and the
# keyword form matches the other plotting cells in this notebook.
sns.barplot(
    x="review_scores_rating",
    y="price",
    data=train.query("review_scores_rating > 70"),
)
    
    Out[51]:
    
In [52]:
    
# Bar plot of `price` grouped by `instant_bookable`.
# x/y are passed by keyword: seaborn 0.12+ rejects positional x/y, and the
# keyword form matches the other plotting cells in this notebook.
sns.barplot(x="instant_bookable", y="price", data=train)
    
    Out[52]:
    
In [54]:
    
# Bar plot of `price` grouped by `require_guest_profile_picture`.
# x/y are passed by keyword: seaborn 0.12+ rejects positional x/y, and the
# keyword form matches the other plotting cells in this notebook.
sns.barplot(x="require_guest_profile_picture", y="price", data=train)
    
    Out[54]:
    
In [ ]: